library("tidyverse")
library("tibble")
library("msigdbr")
library("ggplot2")
library("TCGAbiolinks")
library("RNAseqQC")
library("DESeq2")
library("ensembldb")
library("purrr")
library("magrittr")
library("vsn")
library("matrixStats")
library("dplyr")
library("grex")
Download gene expression data from The Cancer Genome Atlas (TCGA): -
TCGA-COAD
refers to the biospecimen data for colon
adenocarcinoma. - STAR - Counts
pertains to the raw
counts.
query_tumor <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
experimental.strategy = "RNA-Seq",
workflow.type = "STAR - Counts",
access = "open",
sample.type = "Primary Tumor"
)
tumor <- getResults(query_tumor)
tumor
query_normal <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
experimental.strategy = "RNA-Seq",
workflow.type = "STAR - Counts",
access = "open",
sample.type = "Solid Tissue Normal"
)
normal <- getResults(query_normal)
normal
Consider only samples with both normal and malignant tissues.
submitter_ids <- inner_join(tumor, normal, by = "cases.submitter_id") %>%
dplyr::select(cases.submitter_id)
tumor <- tumor %>%
dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)
normal <- normal %>%
dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)
samples <- rbind(tumor, normal)
invisible(unique(samples$sample_type))
samples
Download only samples with both normal and malignant tissues.
To impose this filtering, we set the barcode
argument of
GDCquery
to
samples$sample.submitter_id
(which was generated in the
previous cell).
query_coad <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
experimental.strategy = "RNA-Seq",
workflow.type = "STAR - Counts",
access = "open",
sample.type = c("Solid Tissue Normal", "Primary Tumor"),
barcode = as.list(samples$sample.submitter_id)
)
If this is your first time running this notebook (i.e., you have not yet downloaded the results of the query in the previous block), uncomment the code block below.
# GDCdownload(query_coad)
Running the code block above should generate and populate a directory
named GDCdata
.
Construct the RNA-seq count matrix.
tcga_coad_data <- GDCprepare(query_coad, summarizedExperiment = TRUE)
count_matrix <- assay(tcga_coad_data, "unstranded")
# Remove duplicate entries
count_matrix_df <- data.frame(count_matrix)
count_matrix_df <- count_matrix_df[!duplicated(count_matrix_df), ]
count_matrix <- data.matrix(count_matrix_df)
rownames(count_matrix) <- cleanid(rownames(count_matrix))
count_matrix <- count_matrix[!(duplicated(rownames(count_matrix)) | duplicated(rownames(count_matrix), fromLast = TRUE)), ]
head(count_matrix[1:5, 1:4])
TCGA.AA.3522.01A.01R.0821.07 TCGA.A6.2678.01A.01R.0821.07 TCGA.A6.5665.01B.03R.2302.07 TCGA.AZ.6600.01A.11R.1774.07
ENSG00000000003 4069 4088 670 3195
ENSG00000000005 22 38 10 14
ENSG00000000419 1070 1577 535 2460
ENSG00000000457 386 625 425 624
ENSG00000000460 166 441 253 467
Format the samples
table so that it can be fed as input
to DESeq2.
rownames(samples) <- samples$cases
samples <- samples %>%
dplyr::select(case = "cases.submitter_id", type = "sample_type")
samples$type <- str_replace(samples$type, "Solid Tissue Normal", "normal")
samples$type <- str_replace(samples$type, "Primary Tumor", "tumor")
DESeq2 requires the row names of samples
should be
identical to the column names of count_matrix
.
colnames(count_matrix) <- gsub(x = colnames(count_matrix), pattern = "\\.", replacement = "-")
count_matrix <- count_matrix[, rownames(samples)]
# Sanity check
all(colnames(count_matrix) == rownames(samples))
References:
Construct the DESeqDataSet
object.
dds <- DESeqDataSetFromMatrix(
countData = count_matrix,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
Display quality control (QC) plots (refer to https://cran.r-project.org/web/packages/RNAseqQC/vignettes/introduction.html)
plot_total_counts(dds)
plot_library_complexity(dds)
plot_gene_detection(dds)
Perform gene filtering.
We determined min_count
empirically by looking at the
red trend line in the variance stabilization plot. Ideally, this
trend line should be flat (i.e., stable).
min_count
resulted in a steep,
upward-sloping trend line.
dds <- filter_genes(dds, min_count = 10)
Transform the read counts.
From
https://chipster.csc.fi/manual/deseq2-transform.html:
You can use the resulting transformed values only for visualization
and clustering, not for differential expression analysis which needs
raw counts.
vsd <- vst(dds)
mean_sd_plot(vsd)
Check the clustering of the samples.
If you encounter the error
Error in loadNamespace(x) : there is no package called
'ComplexHeatmap'
, uncomment and run the following code block:
# install.packages("devtools", dependencies = TRUE)
# devtools::install_github("jokergoo/ComplexHeatmap")
set.seed(42)
plot_sample_clustering(vsd, anno_vars = c("type"), distance = "euclidean")
Perform principal component analysis (PCA).
plot_pca(vsd, PC_x = 1, PC_y = 2, shape_by = "type")
Refer to
1. Exploratory Data Analysis - MSigDB Gene Sets + GTEx TPM.Rmd
for more detailed documentation on obtaining the gene sets.
Fetch the necroptosis gene set.
necroptosis.genes <- msigdbr(species = "human", category = "C5", subcategory = "GO:BP") %>%
dplyr::filter(gs_name == "GOBP_NECROPTOTIC_SIGNALING_PATHWAY")
necroptosis.genes
Filter the genes to include only those in the necroptosis gene set.
rownames(necroptosis.genes) <- necroptosis.genes$ensembl_gene
coad_necroptosis <- count_matrix[rownames(count_matrix) %in% necroptosis.genes$ensembl_gene, ]
coad_necroptosis <- coad_necroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_necroptosis) == rownames(samples))
Perform differential gene expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_necroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
res <- results(dds)
summary(res)
out of 8 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 2, 25%
LFC < 0 (down) : 2, 25%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 38)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon",
gene_symbol = necroptosis.genes[rownames(deseq.results), "gene_symbol"]
)
deseq.bbl.data
Plot the results.
ggplot(deseq.bbl.data, aes(x = cancer_type, y = gene_symbol, size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data$log2FoldChange), max(deseq.bbl.data$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "FDR", fill = "log2 FC", x = "Cancer type", y = "Gene")
Fetch the ferroptosis gene set.
ferroptosis.genes <- msigdbr(species = "human", category = "C2", subcategory = "CP:WIKIPATHWAYS") %>%
dplyr::filter(gs_name == "WP_FERROPTOSIS")
ferroptosis.genes
Filter the genes to include only those in the ferroptosis gene set.
rownames(ferroptosis.genes) <- ferroptosis.genes$ensembl_gene
coad_ferroptosis <- count_matrix[rownames(count_matrix) %in% ferroptosis.genes$ensembl_gene, ]
coad_ferroptosis <- coad_ferroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_ferroptosis) == rownames(samples))
Perform differential gene expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_ferroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 2 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
res <- results(dds)
summary(res)
out of 63 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 24, 38%
LFC < 0 (down) : 19, 30%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 8)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon",
gene_symbol = ferroptosis.genes[rownames(deseq.results), "gene_symbol"]
)
deseq.bbl.data
Plot the results.
ggplot(deseq.bbl.data, aes(x = cancer_type, y = gene_symbol, size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data$log2FoldChange), max(deseq.bbl.data$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "FDR", fill = "log2 FC", x = "Cancer type", y = "Gene")
Fetch the pyroptosis gene set.
pyroptosis.genes <- msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME") %>%
dplyr::filter(gs_name == "REACTOME_PYROPTOSIS")
pyroptosis.genes
Filter the genes to include only those in the pyroptosis gene set.
rownames(pyroptosis.genes) <- pyroptosis.genes$ensembl_gene
coad_pyroptosis <- count_matrix[rownames(count_matrix) %in% pyroptosis.genes$ensembl_gene, ]
coad_pyroptosis <- coad_pyroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_pyroptosis) == rownames(samples))
Perform differential gene expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_pyroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 1 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
res <- results(dds)
summary(res)
out of 27 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 10, 37%
LFC < 0 (down) : 7, 26%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 16)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon",
gene_symbol = pyroptosis.genes[rownames(deseq.results), "gene_symbol"]
)
deseq.bbl.data
Plot the results.
ggplot(deseq.bbl.data, aes(x = cancer_type, y = gene_symbol, size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data$log2FoldChange), max(deseq.bbl.data$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "FDR", fill = "log2 FC", x = "Cancer type", y = "Gene")
De La Salle University, Manila, Philippines, kim_leejra@dlsu.edu.ph↩︎
De La Salle University, Manila, Philippines, gonzales.markedward@gmail.com↩︎
De La Salle University, Manila, Philippines, anish.shrestha@dlsu.edu.ph↩︎